Mice Cortex Analysis¶
Retrieving and Preparing the Data¶
#import warnings
#warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
data = pd.read_csv('Data_Cortex_Nuclear.csv', sep = ',')
data.head()
| MouseID | DYRK1A_N | ITSN1_N | BDNF_N | NR1_N | NR2A_N | pAKT_N | pBRAF_N | pCAMKII_N | pCREB_N | pELK_N | pERK_N | pJNK_N | PKCA_N | pMEK_N | pNR1_N | pNR2A_N | pNR2B_N | pPKCAB_N | pRSK_N | AKT_N | BRAF_N | CAMKII_N | CREB_N | ELK_N | ERK_N | GSK3B_N | JNK_N | MEK_N | TRKA_N | RSK_N | APP_N | Bcatenin_N | SOD1_N | MTOR_N | P38_N | pMTOR_N | DSCR1_N | AMPKA_N | NR2B_N | pNUMB_N | RAPTOR_N | TIAM1_N | pP70S6_N | NUMB_N | P70S6_N | pGSK3B_N | pPKCG_N | CDK5_N | S6_N | ADARB1_N | AcetylH3K9_N | RRP1_N | BAX_N | ARC_N | ERBB4_N | nNOS_N | Tau_N | GFAP_N | GluR3_N | GluR4_N | IL1B_N | P3525_N | pCASP9_N | PSD95_N | SNCA_N | Ubiquitin_N | pGSK3B_Tyr216_N | SHH_N | BAD_N | BCL2_N | pS6_N | pCFOS_N | SYP_N | H3AcK18_N | EGR1_N | H3MeK4_N | CaNA_N | Genotype | Treatment | Behavior | class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 309_1 | 0.503644 | 0.747193 | 0.430175 | 2.816329 | 5.990152 | 0.218830 | 0.177565 | 2.373744 | 0.232224 | 1.750936 | 0.687906 | 0.306382 | 0.402698 | 0.296927 | 1.022060 | 0.605673 | 1.877684 | 2.308745 | 0.441599 | 0.859366 | 0.416289 | 0.369608 | 0.178944 | 1.866358 | 3.685247 | 1.537227 | 0.264526 | 0.319677 | 0.813866 | 0.165846 | 0.453910 | 3.037621 | 0.369510 | 0.458539 | 0.335336 | 0.825192 | 0.576916 | 0.448099 | 0.586271 | 0.394721 | 0.339571 | 0.482864 | 0.294170 | 0.182150 | 0.842725 | 0.192608 | 1.443091 | 0.294700 | 0.354605 | 1.339070 | 0.170119 | 0.159102 | 0.188852 | 0.106305 | 0.144989 | 0.176668 | 0.125190 | 0.115291 | 0.228043 | 0.142756 | 0.430957 | 0.247538 | 1.603310 | 2.014875 | 0.108234 | 1.044979 | 0.831557 | 0.188852 | 0.122652 | NaN | 0.106305 | 0.108336 | 0.427099 | 0.114783 | 0.131790 | 0.128186 | 1.675652 | Control | Memantine | C/S | c-CS-m |
| 1 | 309_2 | 0.514617 | 0.689064 | 0.411770 | 2.789514 | 5.685038 | 0.211636 | 0.172817 | 2.292150 | 0.226972 | 1.596377 | 0.695006 | 0.299051 | 0.385987 | 0.281319 | 0.956676 | 0.587559 | 1.725774 | 2.043037 | 0.445222 | 0.834659 | 0.400364 | 0.356178 | 0.173680 | 1.761047 | 3.485287 | 1.509249 | 0.255727 | 0.304419 | 0.780504 | 0.157194 | 0.430940 | 2.921882 | 0.342279 | 0.423560 | 0.324835 | 0.761718 | 0.545097 | 0.420876 | 0.545097 | 0.368255 | 0.321959 | 0.454519 | 0.276431 | 0.182086 | 0.847615 | 0.194815 | 1.439460 | 0.294060 | 0.354548 | 1.306323 | 0.171427 | 0.158129 | 0.184570 | 0.106592 | 0.150471 | 0.178309 | 0.134275 | 0.118235 | 0.238073 | 0.142037 | 0.457156 | 0.257632 | 1.671738 | 2.004605 | 0.109749 | 1.009883 | 0.849270 | 0.200404 | 0.116682 | NaN | 0.106592 | 0.104315 | 0.441581 | 0.111974 | 0.135103 | 0.131119 | 1.743610 | Control | Memantine | C/S | c-CS-m |
| 2 | 309_3 | 0.509183 | 0.730247 | 0.418309 | 2.687201 | 5.622059 | 0.209011 | 0.175722 | 2.283337 | 0.230247 | 1.561316 | 0.677348 | 0.291276 | 0.381002 | 0.281710 | 1.003635 | 0.602449 | 1.731873 | 2.017984 | 0.467668 | 0.814329 | 0.399847 | 0.368089 | 0.173905 | 1.765544 | 3.571456 | 1.501244 | 0.259614 | 0.311747 | 0.785154 | 0.160895 | 0.423187 | 2.944136 | 0.343696 | 0.425005 | 0.324852 | 0.757031 | 0.543620 | 0.404630 | 0.552994 | 0.363880 | 0.313086 | 0.447197 | 0.256648 | 0.184388 | 0.856166 | 0.200737 | 1.524364 | 0.301881 | 0.386087 | 1.279600 | 0.185456 | 0.148696 | 0.190532 | 0.108303 | 0.145330 | 0.176213 | 0.132560 | 0.117760 | 0.244817 | 0.142445 | 0.510472 | 0.255343 | 1.663550 | 2.016831 | 0.108196 | 0.996848 | 0.846709 | 0.193685 | 0.118508 | NaN | 0.108303 | 0.106219 | 0.435777 | 0.111883 | 0.133362 | 0.127431 | 1.926427 | Control | Memantine | C/S | c-CS-m |
| 3 | 309_4 | 0.442107 | 0.617076 | 0.358626 | 2.466947 | 4.979503 | 0.222886 | 0.176463 | 2.152301 | 0.207004 | 1.595086 | 0.583277 | 0.296729 | 0.377087 | 0.313832 | 0.875390 | 0.520293 | 1.566852 | 2.132754 | 0.477671 | 0.727705 | 0.385639 | 0.362970 | 0.179449 | 1.286277 | 2.970137 | 1.419710 | 0.259536 | 0.279218 | 0.734492 | 0.162210 | 0.410615 | 2.500204 | 0.344509 | 0.429211 | 0.330121 | 0.746980 | 0.546763 | 0.386860 | 0.547849 | 0.366771 | 0.328492 | 0.442650 | 0.398534 | 0.161768 | 0.760234 | 0.184169 | 1.612382 | 0.296382 | 0.290680 | 1.198765 | 0.159799 | 0.166112 | 0.185323 | 0.103184 | 0.140656 | 0.163804 | 0.123210 | 0.117439 | 0.234947 | 0.145068 | 0.430996 | 0.251103 | 1.484624 | 1.957233 | 0.119883 | 0.990225 | 0.833277 | 0.192112 | 0.132781 | NaN | 0.103184 | 0.111262 | 0.391691 | 0.130405 | 0.147444 | 0.146901 | 1.700563 | Control | Memantine | C/S | c-CS-m |
| 4 | 309_5 | 0.434940 | 0.617430 | 0.358802 | 2.365785 | 4.718679 | 0.213106 | 0.173627 | 2.134014 | 0.192158 | 1.504230 | 0.550960 | 0.286961 | 0.363502 | 0.277964 | 0.864912 | 0.507990 | 1.480059 | 2.013697 | 0.483416 | 0.687794 | 0.367531 | 0.355311 | 0.174836 | 1.324695 | 2.896334 | 1.359876 | 0.250705 | 0.273667 | 0.702699 | 0.154827 | 0.398550 | 2.456560 | 0.329126 | 0.408755 | 0.313415 | 0.691956 | 0.536860 | 0.360816 | 0.512824 | 0.351551 | 0.312206 | 0.419095 | 0.393447 | 0.160200 | 0.768113 | 0.185718 | 1.645807 | 0.296829 | 0.309345 | 1.206995 | 0.164650 | 0.160687 | 0.188221 | 0.104784 | 0.141983 | 0.167710 | 0.136838 | 0.116048 | 0.255528 | 0.140871 | 0.481227 | 0.251773 | 1.534835 | 2.009109 | 0.119524 | 0.997775 | 0.878668 | 0.205604 | 0.129954 | NaN | 0.104784 | 0.110694 | 0.434154 | 0.118481 | 0.140314 | 0.148380 | 1.839730 | Control | Memantine | C/S | c-CS-m |
print ("Rows: ",data.shape[0])
print ("Columns: ",data.shape[1])
Rows: 1080
Columns: 82
An overview of the data size and the first 5 rows shows that no information was lost when retrieving the dataset from UCI. The columns are:
Mouse ID
Values of the expression levels of 77 proteins; the protein names are followed by _N, indicating that they were measured in the nuclear fraction
Genotype: control (c) or trisomy (t)
Treatment type: memantine (m) or saline (s)
Behavior: context-shock (C/S) or shock-context (S/C)
Class: c-CS-s, c-CS-m, c-SC-s, c-SC-m, t-CS-s, t-CS-m, t-SC-s, t-SC-m
print("Columns: \n",data.columns.tolist())
Columns:
['MouseID', 'DYRK1A_N', 'ITSN1_N', 'BDNF_N', 'NR1_N', 'NR2A_N', 'pAKT_N', 'pBRAF_N', 'pCAMKII_N', 'pCREB_N', 'pELK_N', 'pERK_N', 'pJNK_N', 'PKCA_N', 'pMEK_N', 'pNR1_N', 'pNR2A_N', 'pNR2B_N', 'pPKCAB_N', 'pRSK_N', 'AKT_N', 'BRAF_N', 'CAMKII_N', 'CREB_N', 'ELK_N', 'ERK_N', 'GSK3B_N', 'JNK_N', 'MEK_N', 'TRKA_N', 'RSK_N', 'APP_N', 'Bcatenin_N', 'SOD1_N', 'MTOR_N', 'P38_N', 'pMTOR_N', 'DSCR1_N', 'AMPKA_N', 'NR2B_N', 'pNUMB_N', 'RAPTOR_N', 'TIAM1_N', 'pP70S6_N', 'NUMB_N', 'P70S6_N', 'pGSK3B_N', 'pPKCG_N', 'CDK5_N', 'S6_N', 'ADARB1_N', 'AcetylH3K9_N', 'RRP1_N', 'BAX_N', 'ARC_N', 'ERBB4_N', 'nNOS_N', 'Tau_N', 'GFAP_N', 'GluR3_N', 'GluR4_N', 'IL1B_N', 'P3525_N', 'pCASP9_N', 'PSD95_N', 'SNCA_N', 'Ubiquitin_N', 'pGSK3B_Tyr216_N', 'SHH_N', 'BAD_N', 'BCL2_N', 'pS6_N', 'pCFOS_N', 'SYP_N', 'H3AcK18_N', 'EGR1_N', 'H3MeK4_N', 'CaNA_N', 'Genotype', 'Treatment', 'Behavior', 'class']
# check data types and missing values
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1080 entries, 0 to 1079
Data columns (total 82 columns):
MouseID 1080 non-null object
DYRK1A_N 1077 non-null float64
ITSN1_N 1077 non-null float64
BDNF_N 1077 non-null float64
NR1_N 1077 non-null float64
NR2A_N 1077 non-null float64
pAKT_N 1077 non-null float64
pBRAF_N 1077 non-null float64
pCAMKII_N 1077 non-null float64
pCREB_N 1077 non-null float64
pELK_N 1077 non-null float64
pERK_N 1077 non-null float64
pJNK_N 1077 non-null float64
PKCA_N 1077 non-null float64
pMEK_N 1077 non-null float64
pNR1_N 1077 non-null float64
pNR2A_N 1077 non-null float64
pNR2B_N 1077 non-null float64
pPKCAB_N 1077 non-null float64
pRSK_N 1077 non-null float64
AKT_N 1077 non-null float64
BRAF_N 1077 non-null float64
CAMKII_N 1077 non-null float64
CREB_N 1077 non-null float64
ELK_N 1062 non-null float64
ERK_N 1077 non-null float64
GSK3B_N 1077 non-null float64
JNK_N 1077 non-null float64
MEK_N 1073 non-null float64
TRKA_N 1077 non-null float64
RSK_N 1077 non-null float64
APP_N 1077 non-null float64
Bcatenin_N 1062 non-null float64
SOD1_N 1077 non-null float64
MTOR_N 1077 non-null float64
P38_N 1077 non-null float64
pMTOR_N 1077 non-null float64
DSCR1_N 1077 non-null float64
AMPKA_N 1077 non-null float64
NR2B_N 1077 non-null float64
pNUMB_N 1077 non-null float64
RAPTOR_N 1077 non-null float64
TIAM1_N 1077 non-null float64
pP70S6_N 1077 non-null float64
NUMB_N 1080 non-null float64
P70S6_N 1080 non-null float64
pGSK3B_N 1080 non-null float64
pPKCG_N 1080 non-null float64
CDK5_N 1080 non-null float64
S6_N 1080 non-null float64
ADARB1_N 1080 non-null float64
AcetylH3K9_N 1080 non-null float64
RRP1_N 1080 non-null float64
BAX_N 1080 non-null float64
ARC_N 1080 non-null float64
ERBB4_N 1080 non-null float64
nNOS_N 1080 non-null float64
Tau_N 1080 non-null float64
GFAP_N 1080 non-null float64
GluR3_N 1080 non-null float64
GluR4_N 1080 non-null float64
IL1B_N 1080 non-null float64
P3525_N 1080 non-null float64
pCASP9_N 1080 non-null float64
PSD95_N 1080 non-null float64
SNCA_N 1080 non-null float64
Ubiquitin_N 1080 non-null float64
pGSK3B_Tyr216_N 1080 non-null float64
SHH_N 1080 non-null float64
BAD_N 867 non-null float64
BCL2_N 795 non-null float64
pS6_N 1080 non-null float64
pCFOS_N 1005 non-null float64
SYP_N 1080 non-null float64
H3AcK18_N 900 non-null float64
EGR1_N 870 non-null float64
H3MeK4_N 810 non-null float64
CaNA_N 1080 non-null float64
Genotype 1080 non-null object
Treatment 1080 non-null object
Behavior 1080 non-null object
class 1080 non-null object
dtypes: float64(77), object(5)
memory usage: 692.0+ KB
data.isnull().sum()
MouseID 0
DYRK1A_N 3
ITSN1_N 3
BDNF_N 3
NR1_N 3
..
CaNA_N 0
Genotype 0
Treatment 0
Behavior 0
class 0
Length: 82, dtype: int64
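The truncated output hides most of the columns; a quick sketch to list only the columns that actually contain missing values:
# Show only the columns with at least one missing value
missing = data.isnull().sum()
print(missing[missing > 0])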
Data types are all reasonable, but there are missing values. Many protein columns have exactly 3 missing values, which suggests the same 3 rows are largely empty. As they account for only 0.28% of the rows, it is safe to remove them.
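We can check that the same rows account for these gaps; a sketch counting how many of the 82 values are missing in each row where DYRK1A_N is missing:
# For the rows missing DYRK1A_N, count how many columns are empty in each
print(data[data['DYRK1A_N'].isnull()].isnull().sum(axis=1))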
DYRK1A_percent_missing = data['DYRK1A_N'].isnull().sum() * 100 / len(data)
DYRK1A_percent_missing
0.2777777777777778
data1 = data[data['DYRK1A_N'].notna()]
data1.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1077 entries, 0 to 1079
Data columns (total 82 columns):
MouseID 1077 non-null object
DYRK1A_N 1077 non-null float64
ITSN1_N 1077 non-null float64
BDNF_N 1077 non-null float64
NR1_N 1077 non-null float64
NR2A_N 1077 non-null float64
pAKT_N 1077 non-null float64
pBRAF_N 1077 non-null float64
pCAMKII_N 1077 non-null float64
pCREB_N 1077 non-null float64
pELK_N 1077 non-null float64
pERK_N 1077 non-null float64
pJNK_N 1077 non-null float64
PKCA_N 1077 non-null float64
pMEK_N 1077 non-null float64
pNR1_N 1077 non-null float64
pNR2A_N 1077 non-null float64
pNR2B_N 1077 non-null float64
pPKCAB_N 1077 non-null float64
pRSK_N 1077 non-null float64
AKT_N 1077 non-null float64
BRAF_N 1077 non-null float64
CAMKII_N 1077 non-null float64
CREB_N 1077 non-null float64
ELK_N 1062 non-null float64
ERK_N 1077 non-null float64
GSK3B_N 1077 non-null float64
JNK_N 1077 non-null float64
MEK_N 1073 non-null float64
TRKA_N 1077 non-null float64
RSK_N 1077 non-null float64
APP_N 1077 non-null float64
Bcatenin_N 1062 non-null float64
SOD1_N 1077 non-null float64
MTOR_N 1077 non-null float64
P38_N 1077 non-null float64
pMTOR_N 1077 non-null float64
DSCR1_N 1077 non-null float64
AMPKA_N 1077 non-null float64
NR2B_N 1077 non-null float64
pNUMB_N 1077 non-null float64
RAPTOR_N 1077 non-null float64
TIAM1_N 1077 non-null float64
pP70S6_N 1077 non-null float64
NUMB_N 1077 non-null float64
P70S6_N 1077 non-null float64
pGSK3B_N 1077 non-null float64
pPKCG_N 1077 non-null float64
CDK5_N 1077 non-null float64
S6_N 1077 non-null float64
ADARB1_N 1077 non-null float64
AcetylH3K9_N 1077 non-null float64
RRP1_N 1077 non-null float64
BAX_N 1077 non-null float64
ARC_N 1077 non-null float64
ERBB4_N 1077 non-null float64
nNOS_N 1077 non-null float64
Tau_N 1077 non-null float64
GFAP_N 1077 non-null float64
GluR3_N 1077 non-null float64
GluR4_N 1077 non-null float64
IL1B_N 1077 non-null float64
P3525_N 1077 non-null float64
pCASP9_N 1077 non-null float64
PSD95_N 1077 non-null float64
SNCA_N 1077 non-null float64
Ubiquitin_N 1077 non-null float64
pGSK3B_Tyr216_N 1077 non-null float64
SHH_N 1077 non-null float64
BAD_N 864 non-null float64
BCL2_N 792 non-null float64
pS6_N 1077 non-null float64
pCFOS_N 1002 non-null float64
SYP_N 1077 non-null float64
H3AcK18_N 897 non-null float64
EGR1_N 867 non-null float64
H3MeK4_N 807 non-null float64
CaNA_N 1077 non-null float64
Genotype 1077 non-null object
Treatment 1077 non-null object
Behavior 1077 non-null object
class 1077 non-null object
dtypes: float64(77), object(5)
memory usage: 698.4+ KB
# Drop the remaining rows with missing values in these three columns
data2 = data1.dropna(axis=0, subset=['ELK_N', 'MEK_N', 'Bcatenin_N'])
data2.shape
(1047, 82)
data2.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1047 entries, 0 to 1079
Data columns (total 82 columns):
MouseID 1047 non-null object
DYRK1A_N 1047 non-null float64
ITSN1_N 1047 non-null float64
BDNF_N 1047 non-null float64
NR1_N 1047 non-null float64
NR2A_N 1047 non-null float64
pAKT_N 1047 non-null float64
pBRAF_N 1047 non-null float64
pCAMKII_N 1047 non-null float64
pCREB_N 1047 non-null float64
pELK_N 1047 non-null float64
pERK_N 1047 non-null float64
pJNK_N 1047 non-null float64
PKCA_N 1047 non-null float64
pMEK_N 1047 non-null float64
pNR1_N 1047 non-null float64
pNR2A_N 1047 non-null float64
pNR2B_N 1047 non-null float64
pPKCAB_N 1047 non-null float64
pRSK_N 1047 non-null float64
AKT_N 1047 non-null float64
BRAF_N 1047 non-null float64
CAMKII_N 1047 non-null float64
CREB_N 1047 non-null float64
ELK_N 1047 non-null float64
ERK_N 1047 non-null float64
GSK3B_N 1047 non-null float64
JNK_N 1047 non-null float64
MEK_N 1047 non-null float64
TRKA_N 1047 non-null float64
RSK_N 1047 non-null float64
APP_N 1047 non-null float64
Bcatenin_N 1047 non-null float64
SOD1_N 1047 non-null float64
MTOR_N 1047 non-null float64
P38_N 1047 non-null float64
pMTOR_N 1047 non-null float64
DSCR1_N 1047 non-null float64
AMPKA_N 1047 non-null float64
NR2B_N 1047 non-null float64
pNUMB_N 1047 non-null float64
RAPTOR_N 1047 non-null float64
TIAM1_N 1047 non-null float64
pP70S6_N 1047 non-null float64
NUMB_N 1047 non-null float64
P70S6_N 1047 non-null float64
pGSK3B_N 1047 non-null float64
pPKCG_N 1047 non-null float64
CDK5_N 1047 non-null float64
S6_N 1047 non-null float64
ADARB1_N 1047 non-null float64
AcetylH3K9_N 1047 non-null float64
RRP1_N 1047 non-null float64
BAX_N 1047 non-null float64
ARC_N 1047 non-null float64
ERBB4_N 1047 non-null float64
nNOS_N 1047 non-null float64
Tau_N 1047 non-null float64
GFAP_N 1047 non-null float64
GluR3_N 1047 non-null float64
GluR4_N 1047 non-null float64
IL1B_N 1047 non-null float64
P3525_N 1047 non-null float64
pCASP9_N 1047 non-null float64
PSD95_N 1047 non-null float64
SNCA_N 1047 non-null float64
Ubiquitin_N 1047 non-null float64
pGSK3B_Tyr216_N 1047 non-null float64
SHH_N 1047 non-null float64
BAD_N 849 non-null float64
BCL2_N 777 non-null float64
pS6_N 1047 non-null float64
pCFOS_N 972 non-null float64
SYP_N 1047 non-null float64
H3AcK18_N 867 non-null float64
EGR1_N 852 non-null float64
H3MeK4_N 777 non-null float64
CaNA_N 1047 non-null float64
Genotype 1047 non-null object
Treatment 1047 non-null object
Behavior 1047 non-null object
class 1047 non-null object
dtypes: float64(77), object(5)
memory usage: 678.9+ KB
# Percentage of missing values in the remaining incomplete columns
for col in ['BAD_N', 'BCL2_N', 'pCFOS_N', 'H3AcK18_N', 'EGR1_N', 'H3MeK4_N']:
    percent_missing = data2[col].isnull().sum() * 100 / len(data2)
    print(col, 'percentage of missing values:', percent_missing)
BAD_N percentage of missing values: 18.911174785100286
BCL2_N percentage of missing values: 25.787965616045845
pCFOS_N percentage of missing values: 7.163323782234957
H3AcK18_N percentage of missing values: 17.191977077363898
EGR1_N percentage of missing values: 18.624641833810887
H3MeK4_N percentage of missing values: 25.787965616045845
The columns ‘BAD_N’, ‘BCL2_N’, ‘pCFOS_N’, ‘H3AcK18_N’, ‘EGR1_N’ and ‘H3MeK4_N’ are all missing more than 5% of their values, so dropping those rows is not a good idea in this case. Instead, we replace the missing values with the column mean.
# Impute with the column mean; plain assignment avoids pandas' SettingWithCopyWarning
for col in ['BAD_N', 'BCL2_N', 'pCFOS_N', 'H3AcK18_N', 'EGR1_N', 'H3MeK4_N']:
    data2[col] = data2[col].fillna(data2[col].mean())
data2.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1047 entries, 0 to 1079
Data columns (total 82 columns):
MouseID 1047 non-null object
DYRK1A_N 1047 non-null float64
ITSN1_N 1047 non-null float64
BDNF_N 1047 non-null float64
NR1_N 1047 non-null float64
NR2A_N 1047 non-null float64
pAKT_N 1047 non-null float64
pBRAF_N 1047 non-null float64
pCAMKII_N 1047 non-null float64
pCREB_N 1047 non-null float64
pELK_N 1047 non-null float64
pERK_N 1047 non-null float64
pJNK_N 1047 non-null float64
PKCA_N 1047 non-null float64
pMEK_N 1047 non-null float64
pNR1_N 1047 non-null float64
pNR2A_N 1047 non-null float64
pNR2B_N 1047 non-null float64
pPKCAB_N 1047 non-null float64
pRSK_N 1047 non-null float64
AKT_N 1047 non-null float64
BRAF_N 1047 non-null float64
CAMKII_N 1047 non-null float64
CREB_N 1047 non-null float64
ELK_N 1047 non-null float64
ERK_N 1047 non-null float64
GSK3B_N 1047 non-null float64
JNK_N 1047 non-null float64
MEK_N 1047 non-null float64
TRKA_N 1047 non-null float64
RSK_N 1047 non-null float64
APP_N 1047 non-null float64
Bcatenin_N 1047 non-null float64
SOD1_N 1047 non-null float64
MTOR_N 1047 non-null float64
P38_N 1047 non-null float64
pMTOR_N 1047 non-null float64
DSCR1_N 1047 non-null float64
AMPKA_N 1047 non-null float64
NR2B_N 1047 non-null float64
pNUMB_N 1047 non-null float64
RAPTOR_N 1047 non-null float64
TIAM1_N 1047 non-null float64
pP70S6_N 1047 non-null float64
NUMB_N 1047 non-null float64
P70S6_N 1047 non-null float64
pGSK3B_N 1047 non-null float64
pPKCG_N 1047 non-null float64
CDK5_N 1047 non-null float64
S6_N 1047 non-null float64
ADARB1_N 1047 non-null float64
AcetylH3K9_N 1047 non-null float64
RRP1_N 1047 non-null float64
BAX_N 1047 non-null float64
ARC_N 1047 non-null float64
ERBB4_N 1047 non-null float64
nNOS_N 1047 non-null float64
Tau_N 1047 non-null float64
GFAP_N 1047 non-null float64
GluR3_N 1047 non-null float64
GluR4_N 1047 non-null float64
IL1B_N 1047 non-null float64
P3525_N 1047 non-null float64
pCASP9_N 1047 non-null float64
PSD95_N 1047 non-null float64
SNCA_N 1047 non-null float64
Ubiquitin_N 1047 non-null float64
pGSK3B_Tyr216_N 1047 non-null float64
SHH_N 1047 non-null float64
BAD_N 1047 non-null float64
BCL2_N 1047 non-null float64
pS6_N 1047 non-null float64
pCFOS_N 1047 non-null float64
SYP_N 1047 non-null float64
H3AcK18_N 1047 non-null float64
EGR1_N 1047 non-null float64
H3MeK4_N 1047 non-null float64
CaNA_N 1047 non-null float64
Genotype 1047 non-null object
Treatment 1047 non-null object
Behavior 1047 non-null object
class 1047 non-null object
dtypes: float64(77), object(5)
memory usage: 678.9+ KB
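An equivalent alternative (a sketch, not run here) uses scikit-learn's SimpleImputer, which becomes convenient once the same imputation has to be applied to new data:
from sklearn.impute import SimpleImputer

# Column-wise mean imputation with a reusable, fitted transformer
cols = ['BAD_N', 'BCL2_N', 'pCFOS_N', 'H3AcK18_N', 'EGR1_N', 'H3MeK4_N']
imputer = SimpleImputer(strategy='mean')
data2[cols] = imputer.fit_transform(data2[cols])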
# An overview of the data
data2.describe(include='object')
| MouseID | Genotype | Treatment | Behavior | class | |
|---|---|---|---|---|---|
| count | 1047 | 1047 | 1047 | 1047 | 1047 |
| unique | 1047 | 2 | 2 | 2 | 8 |
| top | 3477_6 | Control | Memantine | S/C | c-SC-m |
| freq | 1 | 540 | 570 | 537 | 150 |
data2.describe(include = np.number).round(3)
| DYRK1A_N | ITSN1_N | BDNF_N | NR1_N | NR2A_N | pAKT_N | pBRAF_N | pCAMKII_N | pCREB_N | pELK_N | pERK_N | pJNK_N | PKCA_N | pMEK_N | pNR1_N | pNR2A_N | pNR2B_N | pPKCAB_N | pRSK_N | AKT_N | BRAF_N | CAMKII_N | CREB_N | ELK_N | ERK_N | GSK3B_N | JNK_N | MEK_N | TRKA_N | RSK_N | APP_N | Bcatenin_N | SOD1_N | MTOR_N | P38_N | pMTOR_N | DSCR1_N | AMPKA_N | NR2B_N | pNUMB_N | RAPTOR_N | TIAM1_N | pP70S6_N | NUMB_N | P70S6_N | pGSK3B_N | pPKCG_N | CDK5_N | S6_N | ADARB1_N | AcetylH3K9_N | RRP1_N | BAX_N | ARC_N | ERBB4_N | nNOS_N | Tau_N | GFAP_N | GluR3_N | GluR4_N | IL1B_N | P3525_N | pCASP9_N | PSD95_N | SNCA_N | Ubiquitin_N | pGSK3B_Tyr216_N | SHH_N | BAD_N | BCL2_N | pS6_N | pCFOS_N | SYP_N | H3AcK18_N | EGR1_N | H3MeK4_N | CaNA_N | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 | 1047.000 |
| mean | 0.428 | 0.620 | 0.318 | 2.295 | 3.832 | 0.234 | 0.183 | 3.525 | 0.213 | 1.434 | 0.549 | 0.316 | 0.316 | 0.276 | 0.826 | 0.729 | 1.562 | 1.511 | 0.445 | 0.685 | 0.380 | 0.363 | 0.181 | 1.173 | 2.471 | 1.172 | 0.243 | 0.273 | 0.692 | 0.169 | 0.405 | 2.147 | 0.547 | 0.452 | 0.415 | 0.760 | 0.585 | 0.368 | 0.564 | 0.357 | 0.315 | 0.419 | 0.389 | 0.181 | 0.948 | 0.161 | 1.687 | 0.292 | 0.429 | 1.190 | 0.215 | 0.167 | 0.179 | 0.121 | 0.156 | 0.181 | 0.210 | 0.121 | 0.221 | 0.126 | 0.527 | 0.290 | 1.554 | 2.238 | 0.160 | 1.241 | 0.847 | 0.226 | 0.158 | 0.135 | 0.121 | 0.131 | 0.445 | 0.169 | 0.183 | 0.206 | 1.338 |
| std | 0.252 | 0.254 | 0.050 | 0.351 | 0.941 | 0.041 | 0.026 | 1.293 | 0.033 | 0.471 | 0.349 | 0.048 | 0.052 | 0.045 | 0.119 | 0.189 | 0.274 | 0.478 | 0.063 | 0.123 | 0.219 | 0.053 | 0.027 | 0.337 | 0.659 | 0.247 | 0.031 | 0.041 | 0.122 | 0.028 | 0.062 | 0.438 | 0.283 | 0.066 | 0.090 | 0.124 | 0.102 | 0.063 | 0.089 | 0.064 | 0.055 | 0.068 | 0.154 | 0.029 | 0.172 | 0.019 | 0.575 | 0.038 | 0.139 | 0.360 | 0.187 | 0.032 | 0.019 | 0.014 | 0.015 | 0.025 | 0.069 | 0.013 | 0.035 | 0.027 | 0.083 | 0.030 | 0.249 | 0.255 | 0.024 | 0.174 | 0.095 | 0.029 | 0.027 | 0.024 | 0.014 | 0.023 | 0.067 | 0.055 | 0.037 | 0.049 | 0.321 |
| min | 0.145 | 0.245 | 0.115 | 1.331 | 1.738 | 0.121 | 0.108 | 1.344 | 0.113 | 0.429 | 0.149 | 0.174 | 0.191 | 0.146 | 0.500 | 0.281 | 0.302 | 0.568 | 0.254 | 0.335 | 0.144 | 0.213 | 0.114 | 0.498 | 1.132 | 0.151 | 0.149 | 0.147 | 0.199 | 0.107 | 0.236 | 1.135 | 0.217 | 0.201 | 0.228 | 0.167 | 0.155 | 0.226 | 0.185 | 0.186 | 0.195 | 0.238 | 0.131 | 0.118 | 0.344 | 0.100 | 0.599 | 0.181 | 0.130 | 0.529 | 0.053 | -0.062 | 0.072 | 0.067 | 0.100 | 0.100 | 0.096 | 0.086 | 0.111 | 0.073 | 0.284 | 0.207 | 0.853 | 1.322 | 0.101 | 0.751 | 0.577 | 0.156 | 0.088 | 0.081 | 0.067 | 0.085 | 0.259 | 0.080 | 0.106 | 0.102 | 0.586 |
| 25% | 0.289 | 0.475 | 0.286 | 2.052 | 3.143 | 0.206 | 0.165 | 2.448 | 0.191 | 1.211 | 0.339 | 0.283 | 0.281 | 0.245 | 0.743 | 0.594 | 1.374 | 1.165 | 0.404 | 0.600 | 0.265 | 0.331 | 0.162 | 0.944 | 1.988 | 1.021 | 0.221 | 0.247 | 0.612 | 0.150 | 0.365 | 1.822 | 0.320 | 0.409 | 0.351 | 0.683 | 0.531 | 0.326 | 0.513 | 0.312 | 0.275 | 0.371 | 0.273 | 0.160 | 0.833 | 0.149 | 1.272 | 0.272 | 0.315 | 0.925 | 0.103 | 0.149 | 0.168 | 0.110 | 0.147 | 0.167 | 0.168 | 0.113 | 0.195 | 0.108 | 0.474 | 0.269 | 1.385 | 2.078 | 0.143 | 1.118 | 0.791 | 0.206 | 0.141 | 0.120 | 0.110 | 0.114 | 0.397 | 0.134 | 0.159 | 0.174 | 1.076 |
| 50% | 0.366 | 0.568 | 0.315 | 2.292 | 3.720 | 0.232 | 0.183 | 3.327 | 0.211 | 1.359 | 0.444 | 0.323 | 0.311 | 0.278 | 0.821 | 0.723 | 1.564 | 1.355 | 0.441 | 0.683 | 0.327 | 0.360 | 0.180 | 1.096 | 2.392 | 1.158 | 0.245 | 0.273 | 0.705 | 0.167 | 0.403 | 2.117 | 0.446 | 0.451 | 0.407 | 0.762 | 0.577 | 0.357 | 0.562 | 0.347 | 0.304 | 0.407 | 0.373 | 0.178 | 0.935 | 0.160 | 1.649 | 0.294 | 0.401 | 1.120 | 0.147 | 0.163 | 0.181 | 0.122 | 0.156 | 0.183 | 0.188 | 0.120 | 0.216 | 0.123 | 0.527 | 0.290 | 1.528 | 2.245 | 0.158 | 1.237 | 0.848 | 0.223 | 0.158 | 0.135 | 0.122 | 0.129 | 0.447 | 0.169 | 0.183 | 0.206 | 1.317 |
| 75% | 0.489 | 0.702 | 0.346 | 2.529 | 4.405 | 0.258 | 0.198 | 4.432 | 0.235 | 1.565 | 0.670 | 0.350 | 0.350 | 0.304 | 0.899 | 0.853 | 1.754 | 1.852 | 0.483 | 0.761 | 0.415 | 0.394 | 0.196 | 1.324 | 2.871 | 1.309 | 0.264 | 0.301 | 0.776 | 0.185 | 0.442 | 2.426 | 0.708 | 0.488 | 0.466 | 0.846 | 0.635 | 0.402 | 0.614 | 0.395 | 0.348 | 0.456 | 0.474 | 0.198 | 1.049 | 0.172 | 2.075 | 0.313 | 0.540 | 1.374 | 0.266 | 0.178 | 0.192 | 0.132 | 0.165 | 0.199 | 0.231 | 0.128 | 0.246 | 0.141 | 0.580 | 0.311 | 1.721 | 2.427 | 0.174 | 1.364 | 0.917 | 0.241 | 0.168 | 0.140 | 0.132 | 0.143 | 0.491 | 0.186 | 0.196 | 0.219 | 1.591 |
| max | 2.516 | 2.603 | 0.497 | 3.758 | 8.483 | 0.539 | 0.317 | 7.464 | 0.306 | 6.113 | 3.567 | 0.493 | 0.474 | 0.458 | 1.408 | 1.413 | 2.724 | 3.061 | 0.651 | 1.182 | 2.133 | 0.586 | 0.320 | 2.803 | 5.198 | 2.476 | 0.387 | 0.415 | 1.002 | 0.305 | 0.633 | 3.681 | 1.873 | 0.677 | 0.933 | 1.125 | 0.916 | 0.701 | 0.972 | 0.631 | 0.527 | 0.722 | 1.129 | 0.317 | 1.680 | 0.253 | 3.382 | 0.817 | 0.823 | 2.540 | 1.459 | 0.612 | 0.241 | 0.159 | 0.209 | 0.261 | 0.603 | 0.214 | 0.331 | 0.537 | 0.890 | 0.444 | 2.586 | 2.878 | 0.258 | 1.897 | 1.205 | 0.358 | 0.282 | 0.262 | 0.159 | 0.257 | 0.760 | 0.480 | 0.361 | 0.414 | 2.130 |
data3 = data2.drop(['MouseID'], axis = 1)
data3.shape
(1047, 81)
# Check Unique values in object feature
object_col = data3.columns[data3.dtypes==object].tolist()
for col in object_col:
print('The unique values and numbers of', col, 'are:')
print(data3[col].value_counts())
print('=========')
The unique values and numbers of Genotype are:
Control 540
Ts65Dn 507
Name: Genotype, dtype: int64
=========
The unique values and numbers of Treatment are:
Memantine 570
Saline 477
Name: Treatment, dtype: int64
=========
The unique values and numbers of Behavior are:
S/C 537
C/S 510
Name: Behavior, dtype: int64
=========
The unique values and numbers of class are:
c-SC-m 150
c-CS-m 150
t-CS-m 135
t-SC-m 135
t-SC-s 132
c-CS-s 120
c-SC-s 120
t-CS-s 105
Name: class, dtype: int64
=========
According to the output and the dataset description, the class column already encodes the information in Genotype, Treatment and Behavior, so those three columns can be dropped.
We will use data4 for modelling and data3 for exploration.
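Before dropping them, this can be sanity-checked; a quick sketch verifying that each (Genotype, Treatment, Behavior) combination maps to exactly one class value:
# Every (Genotype, Treatment, Behavior) combination should map to a single class
print(data3.groupby(['Genotype', 'Treatment', 'Behavior'])['class'].nunique())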
data4 = data3.drop(['Genotype', 'Treatment', 'Behavior'], axis = 1)
data4.shape
(1047, 78)
data4.head()
| DYRK1A_N | ITSN1_N | BDNF_N | NR1_N | NR2A_N | pAKT_N | pBRAF_N | pCAMKII_N | pCREB_N | pELK_N | pERK_N | pJNK_N | PKCA_N | pMEK_N | pNR1_N | pNR2A_N | pNR2B_N | pPKCAB_N | pRSK_N | AKT_N | BRAF_N | CAMKII_N | CREB_N | ELK_N | ERK_N | GSK3B_N | JNK_N | MEK_N | TRKA_N | RSK_N | APP_N | Bcatenin_N | SOD1_N | MTOR_N | P38_N | pMTOR_N | DSCR1_N | AMPKA_N | NR2B_N | pNUMB_N | RAPTOR_N | TIAM1_N | pP70S6_N | NUMB_N | P70S6_N | pGSK3B_N | pPKCG_N | CDK5_N | S6_N | ADARB1_N | AcetylH3K9_N | RRP1_N | BAX_N | ARC_N | ERBB4_N | nNOS_N | Tau_N | GFAP_N | GluR3_N | GluR4_N | IL1B_N | P3525_N | pCASP9_N | PSD95_N | SNCA_N | Ubiquitin_N | pGSK3B_Tyr216_N | SHH_N | BAD_N | BCL2_N | pS6_N | pCFOS_N | SYP_N | H3AcK18_N | EGR1_N | H3MeK4_N | CaNA_N | class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.503644 | 0.747193 | 0.430175 | 2.816329 | 5.990152 | 0.218830 | 0.177565 | 2.373744 | 0.232224 | 1.750936 | 0.687906 | 0.306382 | 0.402698 | 0.296927 | 1.022060 | 0.605673 | 1.877684 | 2.308745 | 0.441599 | 0.859366 | 0.416289 | 0.369608 | 0.178944 | 1.866358 | 3.685247 | 1.537227 | 0.264526 | 0.319677 | 0.813866 | 0.165846 | 0.453910 | 3.037621 | 0.369510 | 0.458539 | 0.335336 | 0.825192 | 0.576916 | 0.448099 | 0.586271 | 0.394721 | 0.339571 | 0.482864 | 0.294170 | 0.182150 | 0.842725 | 0.192608 | 1.443091 | 0.294700 | 0.354605 | 1.339070 | 0.170119 | 0.159102 | 0.188852 | 0.106305 | 0.144989 | 0.176668 | 0.125190 | 0.115291 | 0.228043 | 0.142756 | 0.430957 | 0.247538 | 1.603310 | 2.014875 | 0.108234 | 1.044979 | 0.831557 | 0.188852 | 0.122652 | 0.13486 | 0.106305 | 0.108336 | 0.427099 | 0.114783 | 0.131790 | 0.128186 | 1.675652 | c-CS-m |
| 1 | 0.514617 | 0.689064 | 0.411770 | 2.789514 | 5.685038 | 0.211636 | 0.172817 | 2.292150 | 0.226972 | 1.596377 | 0.695006 | 0.299051 | 0.385987 | 0.281319 | 0.956676 | 0.587559 | 1.725774 | 2.043037 | 0.445222 | 0.834659 | 0.400364 | 0.356178 | 0.173680 | 1.761047 | 3.485287 | 1.509249 | 0.255727 | 0.304419 | 0.780504 | 0.157194 | 0.430940 | 2.921882 | 0.342279 | 0.423560 | 0.324835 | 0.761718 | 0.545097 | 0.420876 | 0.545097 | 0.368255 | 0.321959 | 0.454519 | 0.276431 | 0.182086 | 0.847615 | 0.194815 | 1.439460 | 0.294060 | 0.354548 | 1.306323 | 0.171427 | 0.158129 | 0.184570 | 0.106592 | 0.150471 | 0.178309 | 0.134275 | 0.118235 | 0.238073 | 0.142037 | 0.457156 | 0.257632 | 1.671738 | 2.004605 | 0.109749 | 1.009883 | 0.849270 | 0.200404 | 0.116682 | 0.13486 | 0.106592 | 0.104315 | 0.441581 | 0.111974 | 0.135103 | 0.131119 | 1.743610 | c-CS-m |
| 2 | 0.509183 | 0.730247 | 0.418309 | 2.687201 | 5.622059 | 0.209011 | 0.175722 | 2.283337 | 0.230247 | 1.561316 | 0.677348 | 0.291276 | 0.381002 | 0.281710 | 1.003635 | 0.602449 | 1.731873 | 2.017984 | 0.467668 | 0.814329 | 0.399847 | 0.368089 | 0.173905 | 1.765544 | 3.571456 | 1.501244 | 0.259614 | 0.311747 | 0.785154 | 0.160895 | 0.423187 | 2.944136 | 0.343696 | 0.425005 | 0.324852 | 0.757031 | 0.543620 | 0.404630 | 0.552994 | 0.363880 | 0.313086 | 0.447197 | 0.256648 | 0.184388 | 0.856166 | 0.200737 | 1.524364 | 0.301881 | 0.386087 | 1.279600 | 0.185456 | 0.148696 | 0.190532 | 0.108303 | 0.145330 | 0.176213 | 0.132560 | 0.117760 | 0.244817 | 0.142445 | 0.510472 | 0.255343 | 1.663550 | 2.016831 | 0.108196 | 0.996848 | 0.846709 | 0.193685 | 0.118508 | 0.13486 | 0.108303 | 0.106219 | 0.435777 | 0.111883 | 0.133362 | 0.127431 | 1.926427 | c-CS-m |
| 3 | 0.442107 | 0.617076 | 0.358626 | 2.466947 | 4.979503 | 0.222886 | 0.176463 | 2.152301 | 0.207004 | 1.595086 | 0.583277 | 0.296729 | 0.377087 | 0.313832 | 0.875390 | 0.520293 | 1.566852 | 2.132754 | 0.477671 | 0.727705 | 0.385639 | 0.362970 | 0.179449 | 1.286277 | 2.970137 | 1.419710 | 0.259536 | 0.279218 | 0.734492 | 0.162210 | 0.410615 | 2.500204 | 0.344509 | 0.429211 | 0.330121 | 0.746980 | 0.546763 | 0.386860 | 0.547849 | 0.366771 | 0.328492 | 0.442650 | 0.398534 | 0.161768 | 0.760234 | 0.184169 | 1.612382 | 0.296382 | 0.290680 | 1.198765 | 0.159799 | 0.166112 | 0.185323 | 0.103184 | 0.140656 | 0.163804 | 0.123210 | 0.117439 | 0.234947 | 0.145068 | 0.430996 | 0.251103 | 1.484624 | 1.957233 | 0.119883 | 0.990225 | 0.833277 | 0.192112 | 0.132781 | 0.13486 | 0.103184 | 0.111262 | 0.391691 | 0.130405 | 0.147444 | 0.146901 | 1.700563 | c-CS-m |
| 4 | 0.434940 | 0.617430 | 0.358802 | 2.365785 | 4.718679 | 0.213106 | 0.173627 | 2.134014 | 0.192158 | 1.504230 | 0.550960 | 0.286961 | 0.363502 | 0.277964 | 0.864912 | 0.507990 | 1.480059 | 2.013697 | 0.483416 | 0.687794 | 0.367531 | 0.355311 | 0.174836 | 1.324695 | 2.896334 | 1.359876 | 0.250705 | 0.273667 | 0.702699 | 0.154827 | 0.398550 | 2.456560 | 0.329126 | 0.408755 | 0.313415 | 0.691956 | 0.536860 | 0.360816 | 0.512824 | 0.351551 | 0.312206 | 0.419095 | 0.393447 | 0.160200 | 0.768113 | 0.185718 | 1.645807 | 0.296829 | 0.309345 | 1.206995 | 0.164650 | 0.160687 | 0.188221 | 0.104784 | 0.141983 | 0.167710 | 0.136838 | 0.116048 | 0.255528 | 0.140871 | 0.481227 | 0.251773 | 1.534835 | 2.009109 | 0.119524 | 0.997775 | 0.878668 | 0.205604 | 0.129954 | 0.13486 | 0.104784 | 0.110694 | 0.434154 | 0.118481 | 0.140314 | 0.148380 | 1.839730 | c-CS-m |
We end up with a dataset of 77 descriptive features (protein expression levels) and one target feature (class).
Data exploration¶
import altair as alt
alt.data_transformers.disable_max_rows()
DataTransformerRegistry.enable('default')
Single column¶
import matplotlib.pyplot as plt
alt.Chart(data3, width=400).mark_bar().encode(x=alt.X('class', sort='-y'), y='count()').properties(
title='Number of Measurements for each class')
The bar chart shows that the target classes are somewhat imbalanced.
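The imbalance can also be quantified directly; a quick sketch:
# Relative frequency of each class
print(data3['class'].value_counts(normalize=True).round(3))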
# Boxplot for column 'DYRK1A_N'
data3.boxplot(column = 'DYRK1A_N')
plt.title("Box Plot Distribution of 'DYRK1A_N' Column")
plt.ylabel('Expression Levels')
plt.show()
The box plot of the ‘DYRK1A_N’ column shows that this protein's expression levels are quite skewed, and several points fall outside the whiskers. It is unclear whether these are true outliers or simply a consequence of the skewed distribution, so they are not removed at this stage; further investigation is needed to understand this feature.
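For reference, the points beyond the usual 1.5 × IQR whiskers can be counted directly; a quick sketch (inspection only, nothing is removed):
# Count points beyond the 1.5*IQR whiskers for DYRK1A_N
q1, q3 = data3['DYRK1A_N'].quantile([0.25, 0.75])
iqr = q3 - q1
mask = (data3['DYRK1A_N'] < q1 - 1.5 * iqr) | (data3['DYRK1A_N'] > q3 + 1.5 * iqr)
print(mask.sum(), 'potential outliers out of', len(data3))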
A for-loop is used to draw the box plots of 8 other columns:
for col in ['ITSN1_N', 'BDNF_N', 'NR1_N', 'NR2A_N', 'pAKT_N', 'pERK_N', 'pJNK_N', 'PKCA_N']:
    data3.boxplot(column = col)
    plt.title("Box Plot Distribution of Protein Expression Levels")
    plt.ylabel('Expression Levels')
    plt.show()
alt.Chart(data3).mark_bar().encode(
alt.X("pBRAF_N", bin=alt.Bin(extent=[0.1, 0.325], step=0.0125)),
y='count()').properties(title='Histogram distribution of protein expression')
The histograms of pBRAF_N (above) and pCAMKII_N (below) protein expression levels are both right-skewed.
alt.Chart(data3).mark_bar().encode(alt.X('pCAMKII_N', bin=alt.Bin(extent=[1, 7.5], step=0.5)), y = 'count()').properties(
title='Histogram distribution of protein expression')
Relationship between pairs of attributes¶
alt.Chart(data3, width=500).mark_boxplot().encode(y='CaNA_N', x='class').properties(
title='Box Plot of CaNA_N level by Class')
alt.Chart(data3, width=300).mark_boxplot().encode(y='EGR1_N', x='Genotype').properties(
title='Box Plot of EGR1_N level by Genotype')
alt.Chart(data3, width=300).mark_boxplot().encode(y='SYP_N', x='Behavior').properties(
title='Box Plot of SYP_N level by Behavior')
alt.Chart(data3, width=300).mark_boxplot().encode(y='H3MeK4_N', x='Treatment').properties(
title='Box Plot of H3MeK4_N level by Treatment')
alt.Chart(data3).mark_point().encode(x='SNCA_N', y='Ubiquitin_N').properties(
title='Scatter plot for SNCA_N vs. Ubiquitin_N')
alt.Chart(data3).mark_point().encode(x='PSD95_N', y='pCASP9_N').properties(
title='Scatter plot for PSD95_N vs. pCASP9_N')
alt.Chart(data3).mark_point().encode(x='P3525_N', y='IL1B_N').properties(
title='Scatter plot for P3525_N vs. IL1B_N')
alt.Chart(data3).mark_point().encode(x='GluR4_N', y='GluR3_N').properties(
title='Scatter plot for GluR4_N vs. GluR3_N')
alt.Chart(data3).mark_point().encode(x='GFAP_N', y='Tau_N').properties(
title='Scatter plot for GFAP_N vs. Tau_N')
alt.Chart(data3).mark_point().encode(x='nNOS_N', y='ERBB4_N').properties(
title='Scatter plot for nNOS_N vs. ERBB4_N')
Data Modelling using two Classification models¶
Feature encoding and scaling¶
# descriptive features
Data = data4.drop(columns = 'class')
# target feature
target = data4['class']
target.shape
(1047,)
target_names = data4['class'].unique()
target.value_counts()
c-SC-m 150
c-CS-m 150
t-CS-m 135
t-SC-m 135
t-SC-s 132
c-CS-s 120
c-SC-s 120
t-CS-s 105
Name: class, dtype: int64
target = target.replace({'c-SC-m': 0, 'c-CS-m':1, 't-SC-m':2, 't-CS-m':3, 't-SC-s':4, 'c-SC-s':5, 'c-CS-s':6, 't-CS-s':7})
target.value_counts()
1 150
0 150
3 135
2 135
4 132
6 120
5 120
7 105
Name: class, dtype: int64
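For reference, scikit-learn's LabelEncoder produces an equivalent integer encoding, although it assigns codes in alphabetical order of the class names rather than via the manual mapping above; a sketch:
from sklearn.preprocessing import LabelEncoder

# Alternative encoding: codes follow the alphabetical order of the class labels
le = LabelEncoder()
target_le = le.fit_transform(data4['class'])
print(dict(zip(le.classes_, le.transform(le.classes_))))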
Data.shape
(1047, 77)
from sklearn import preprocessing
# Keep the DataFrame to recover the column names after scaling
Data_df = Data.copy()
# Min-max scale every feature to the [0, 1] range
Data_scaler = preprocessing.MinMaxScaler()
Data = Data_scaler.fit_transform(Data)
pd.DataFrame(Data).head()
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.151122 | 0.212885 | 0.824638 | 0.612119 | 0.630482 | 0.234015 | 0.333862 | 0.168257 | 0.617322 | 0.232553 | 0.157643 | 0.414113 | 0.747688 | 0.482986 | 0.574775 | 0.286697 | 0.650637 | 0.698164 | 0.472442 | 0.619182 | 0.136915 | 0.419649 | 0.317149 | 0.593715 | 0.627907 | 0.596269 | 0.483960 | 0.643070 | 0.766146 | 0.295596 | 0.549817 | 0.747441 | 0.092035 | 0.541196 | 0.152338 | 0.687269 | 0.553922 | 0.467278 | 0.509996 | 0.469461 | 0.436170 | 0.506018 | 0.163368 | 0.323059 | 0.373254 | 0.604516 | 0.303363 | 0.178458 | 0.324085 | 0.402810 | 0.083584 | 0.327870 | 0.690257 | 0.426816 | 0.412721 | 0.477834 | 0.057168 | 0.228825 | 0.531163 | 0.151103 | 0.242608 | 0.169702 | 0.432843 | 0.445345 | 0.044770 | 0.256699 | 0.405228 | 0.162941 | 0.177312 | 0.299717 | 0.426816 | 0.133930 | 0.336299 | 0.087715 | 0.102890 | 0.084580 | 0.705738 |
| 1 | 0.155750 | 0.188226 | 0.776455 | 0.601070 | 0.585247 | 0.216807 | 0.311188 | 0.154925 | 0.590173 | 0.205362 | 0.159721 | 0.391151 | 0.688545 | 0.432886 | 0.502766 | 0.270688 | 0.587926 | 0.591606 | 0.481570 | 0.590036 | 0.128911 | 0.383669 | 0.291583 | 0.548032 | 0.578736 | 0.584234 | 0.446943 | 0.586180 | 0.724593 | 0.251840 | 0.491969 | 0.701976 | 0.075589 | 0.467650 | 0.137451 | 0.621033 | 0.512117 | 0.409897 | 0.457694 | 0.410046 | 0.383101 | 0.447496 | 0.145594 | 0.322736 | 0.376914 | 0.618918 | 0.302059 | 0.177452 | 0.324004 | 0.386524 | 0.084514 | 0.326426 | 0.664894 | 0.429952 | 0.463251 | 0.488028 | 0.075103 | 0.251911 | 0.576828 | 0.149555 | 0.285860 | 0.212421 | 0.472327 | 0.438745 | 0.054452 | 0.226088 | 0.433471 | 0.220010 | 0.146494 | 0.299717 | 0.429952 | 0.110434 | 0.365208 | 0.080692 | 0.115874 | 0.093977 | 0.749771 |
| 2 | 0.153459 | 0.205696 | 0.793572 | 0.558911 | 0.575910 | 0.210527 | 0.325060 | 0.153485 | 0.607102 | 0.199194 | 0.154554 | 0.366797 | 0.670905 | 0.434143 | 0.554483 | 0.283848 | 0.590443 | 0.581558 | 0.538130 | 0.566053 | 0.128651 | 0.415579 | 0.292676 | 0.549983 | 0.599925 | 0.580790 | 0.463293 | 0.613502 | 0.730384 | 0.270560 | 0.472444 | 0.710718 | 0.076445 | 0.470688 | 0.137475 | 0.616142 | 0.510175 | 0.375653 | 0.467725 | 0.400225 | 0.356363 | 0.432379 | 0.125773 | 0.334325 | 0.383316 | 0.657565 | 0.332565 | 0.189744 | 0.369553 | 0.373234 | 0.094486 | 0.312439 | 0.700212 | 0.448652 | 0.415863 | 0.475009 | 0.071718 | 0.248192 | 0.607535 | 0.150434 | 0.373879 | 0.202733 | 0.467603 | 0.446602 | 0.044526 | 0.214719 | 0.429387 | 0.186816 | 0.155920 | 0.299717 | 0.448652 | 0.121560 | 0.353621 | 0.080465 | 0.109050 | 0.082162 | 0.868229 |
| 3 | 0.125169 | 0.157688 | 0.637326 | 0.468152 | 0.480646 | 0.243717 | 0.328596 | 0.132074 | 0.486945 | 0.205135 | 0.127028 | 0.383877 | 0.657048 | 0.537247 | 0.413245 | 0.211238 | 0.522319 | 0.627585 | 0.563335 | 0.463862 | 0.121509 | 0.401866 | 0.319599 | 0.342080 | 0.452058 | 0.545716 | 0.462966 | 0.492220 | 0.667283 | 0.277208 | 0.440780 | 0.536330 | 0.076936 | 0.479533 | 0.144945 | 0.605654 | 0.514305 | 0.338199 | 0.461189 | 0.406715 | 0.402786 | 0.422989 | 0.267936 | 0.220414 | 0.311501 | 0.549444 | 0.364189 | 0.181101 | 0.231762 | 0.333033 | 0.076248 | 0.338264 | 0.669358 | 0.392700 | 0.372772 | 0.397939 | 0.053257 | 0.245676 | 0.562594 | 0.156083 | 0.242672 | 0.184790 | 0.364359 | 0.408299 | 0.119259 | 0.208943 | 0.407971 | 0.179047 | 0.229602 | 0.299717 | 0.392700 | 0.151031 | 0.265619 | 0.126763 | 0.164241 | 0.144543 | 0.721879 |
| 4 | 0.122146 | 0.157838 | 0.637787 | 0.426467 | 0.441977 | 0.220323 | 0.315055 | 0.129086 | 0.410194 | 0.189152 | 0.117572 | 0.353282 | 0.608970 | 0.422119 | 0.401706 | 0.200364 | 0.486489 | 0.579839 | 0.577813 | 0.416779 | 0.112407 | 0.381348 | 0.297196 | 0.358746 | 0.433909 | 0.519977 | 0.425815 | 0.471524 | 0.627685 | 0.239874 | 0.410395 | 0.519186 | 0.067645 | 0.436522 | 0.121261 | 0.548237 | 0.501294 | 0.283304 | 0.416698 | 0.372548 | 0.353712 | 0.374357 | 0.262839 | 0.212521 | 0.317400 | 0.559551 | 0.376199 | 0.181805 | 0.258720 | 0.337126 | 0.079697 | 0.330219 | 0.686524 | 0.410187 | 0.385008 | 0.422195 | 0.080162 | 0.234762 | 0.656300 | 0.147044 | 0.325597 | 0.187625 | 0.393332 | 0.441639 | 0.116965 | 0.215528 | 0.480342 | 0.245702 | 0.215008 | 0.299717 | 0.410187 | 0.147711 | 0.350381 | 0.096959 | 0.136298 | 0.149281 | 0.812053 |
df = pd.DataFrame(Data, columns=Data_df.columns)
df.shape
(1047, 77)
df.head()
| DYRK1A_N | ITSN1_N | BDNF_N | NR1_N | NR2A_N | pAKT_N | pBRAF_N | pCAMKII_N | pCREB_N | pELK_N | pERK_N | pJNK_N | PKCA_N | pMEK_N | pNR1_N | pNR2A_N | pNR2B_N | pPKCAB_N | pRSK_N | AKT_N | BRAF_N | CAMKII_N | CREB_N | ELK_N | ERK_N | GSK3B_N | JNK_N | MEK_N | TRKA_N | RSK_N | APP_N | Bcatenin_N | SOD1_N | MTOR_N | P38_N | pMTOR_N | DSCR1_N | AMPKA_N | NR2B_N | pNUMB_N | RAPTOR_N | TIAM1_N | pP70S6_N | NUMB_N | P70S6_N | pGSK3B_N | pPKCG_N | CDK5_N | S6_N | ADARB1_N | AcetylH3K9_N | RRP1_N | BAX_N | ARC_N | ERBB4_N | nNOS_N | Tau_N | GFAP_N | GluR3_N | GluR4_N | IL1B_N | P3525_N | pCASP9_N | PSD95_N | SNCA_N | Ubiquitin_N | pGSK3B_Tyr216_N | SHH_N | BAD_N | BCL2_N | pS6_N | pCFOS_N | SYP_N | H3AcK18_N | EGR1_N | H3MeK4_N | CaNA_N | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.151122 | 0.212885 | 0.824638 | 0.612119 | 0.630482 | 0.234015 | 0.333862 | 0.168257 | 0.617322 | 0.232553 | 0.157643 | 0.414113 | 0.747688 | 0.482986 | 0.574775 | 0.286697 | 0.650637 | 0.698164 | 0.472442 | 0.619182 | 0.136915 | 0.419649 | 0.317149 | 0.593715 | 0.627907 | 0.596269 | 0.483960 | 0.643070 | 0.766146 | 0.295596 | 0.549817 | 0.747441 | 0.092035 | 0.541196 | 0.152338 | 0.687269 | 0.553922 | 0.467278 | 0.509996 | 0.469461 | 0.436170 | 0.506018 | 0.163368 | 0.323059 | 0.373254 | 0.604516 | 0.303363 | 0.178458 | 0.324085 | 0.402810 | 0.083584 | 0.327870 | 0.690257 | 0.426816 | 0.412721 | 0.477834 | 0.057168 | 0.228825 | 0.531163 | 0.151103 | 0.242608 | 0.169702 | 0.432843 | 0.445345 | 0.044770 | 0.256699 | 0.405228 | 0.162941 | 0.177312 | 0.299717 | 0.426816 | 0.133930 | 0.336299 | 0.087715 | 0.102890 | 0.084580 | 0.705738 |
| 1 | 0.155750 | 0.188226 | 0.776455 | 0.601070 | 0.585247 | 0.216807 | 0.311188 | 0.154925 | 0.590173 | 0.205362 | 0.159721 | 0.391151 | 0.688545 | 0.432886 | 0.502766 | 0.270688 | 0.587926 | 0.591606 | 0.481570 | 0.590036 | 0.128911 | 0.383669 | 0.291583 | 0.548032 | 0.578736 | 0.584234 | 0.446943 | 0.586180 | 0.724593 | 0.251840 | 0.491969 | 0.701976 | 0.075589 | 0.467650 | 0.137451 | 0.621033 | 0.512117 | 0.409897 | 0.457694 | 0.410046 | 0.383101 | 0.447496 | 0.145594 | 0.322736 | 0.376914 | 0.618918 | 0.302059 | 0.177452 | 0.324004 | 0.386524 | 0.084514 | 0.326426 | 0.664894 | 0.429952 | 0.463251 | 0.488028 | 0.075103 | 0.251911 | 0.576828 | 0.149555 | 0.285860 | 0.212421 | 0.472327 | 0.438745 | 0.054452 | 0.226088 | 0.433471 | 0.220010 | 0.146494 | 0.299717 | 0.429952 | 0.110434 | 0.365208 | 0.080692 | 0.115874 | 0.093977 | 0.749771 |
| 2 | 0.153459 | 0.205696 | 0.793572 | 0.558911 | 0.575910 | 0.210527 | 0.325060 | 0.153485 | 0.607102 | 0.199194 | 0.154554 | 0.366797 | 0.670905 | 0.434143 | 0.554483 | 0.283848 | 0.590443 | 0.581558 | 0.538130 | 0.566053 | 0.128651 | 0.415579 | 0.292676 | 0.549983 | 0.599925 | 0.580790 | 0.463293 | 0.613502 | 0.730384 | 0.270560 | 0.472444 | 0.710718 | 0.076445 | 0.470688 | 0.137475 | 0.616142 | 0.510175 | 0.375653 | 0.467725 | 0.400225 | 0.356363 | 0.432379 | 0.125773 | 0.334325 | 0.383316 | 0.657565 | 0.332565 | 0.189744 | 0.369553 | 0.373234 | 0.094486 | 0.312439 | 0.700212 | 0.448652 | 0.415863 | 0.475009 | 0.071718 | 0.248192 | 0.607535 | 0.150434 | 0.373879 | 0.202733 | 0.467603 | 0.446602 | 0.044526 | 0.214719 | 0.429387 | 0.186816 | 0.155920 | 0.299717 | 0.448652 | 0.121560 | 0.353621 | 0.080465 | 0.109050 | 0.082162 | 0.868229 |
| 3 | 0.125169 | 0.157688 | 0.637326 | 0.468152 | 0.480646 | 0.243717 | 0.328596 | 0.132074 | 0.486945 | 0.205135 | 0.127028 | 0.383877 | 0.657048 | 0.537247 | 0.413245 | 0.211238 | 0.522319 | 0.627585 | 0.563335 | 0.463862 | 0.121509 | 0.401866 | 0.319599 | 0.342080 | 0.452058 | 0.545716 | 0.462966 | 0.492220 | 0.667283 | 0.277208 | 0.440780 | 0.536330 | 0.076936 | 0.479533 | 0.144945 | 0.605654 | 0.514305 | 0.338199 | 0.461189 | 0.406715 | 0.402786 | 0.422989 | 0.267936 | 0.220414 | 0.311501 | 0.549444 | 0.364189 | 0.181101 | 0.231762 | 0.333033 | 0.076248 | 0.338264 | 0.669358 | 0.392700 | 0.372772 | 0.397939 | 0.053257 | 0.245676 | 0.562594 | 0.156083 | 0.242672 | 0.184790 | 0.364359 | 0.408299 | 0.119259 | 0.208943 | 0.407971 | 0.179047 | 0.229602 | 0.299717 | 0.392700 | 0.151031 | 0.265619 | 0.126763 | 0.164241 | 0.144543 | 0.721879 |
| 4 | 0.122146 | 0.157838 | 0.637787 | 0.426467 | 0.441977 | 0.220323 | 0.315055 | 0.129086 | 0.410194 | 0.189152 | 0.117572 | 0.353282 | 0.608970 | 0.422119 | 0.401706 | 0.200364 | 0.486489 | 0.579839 | 0.577813 | 0.416779 | 0.112407 | 0.381348 | 0.297196 | 0.358746 | 0.433909 | 0.519977 | 0.425815 | 0.471524 | 0.627685 | 0.239874 | 0.410395 | 0.519186 | 0.067645 | 0.436522 | 0.121261 | 0.548237 | 0.501294 | 0.283304 | 0.416698 | 0.372548 | 0.353712 | 0.374357 | 0.262839 | 0.212521 | 0.317400 | 0.559551 | 0.376199 | 0.181805 | 0.258720 | 0.337126 | 0.079697 | 0.330219 | 0.686524 | 0.410187 | 0.385008 | 0.422195 | 0.080162 | 0.234762 | 0.656300 | 0.147044 | 0.325597 | 0.187625 | 0.393332 | 0.441639 | 0.116965 | 0.215528 | 0.480342 | 0.245702 | 0.215008 | 0.299717 | 0.410187 | 0.147711 | 0.350381 | 0.096959 | 0.136298 | 0.149281 | 0.812053 |
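MinMaxScaler maps each column to the [0, 1] range via x' = (x − min) / (max − min); a quick sanity check:
import numpy as np

# After min-max scaling, every column spans exactly [0, 1]
assert np.allclose(df.min(), 0) and np.allclose(df.max(), 1)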
Feature selection¶
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import shuffle
# Greedy forward selection: visit the features in a random order and keep each one
# only if adding it does not decrease the hold-out accuracy of a KNN classifier
new_Ind = []
cur_MaxScore = 0
col_num = 77
col_Ind_Random = shuffle(list(range(col_num)), random_state=1)
for cur_f in range(0, col_num):
    # tentatively add the next candidate feature
    new_Ind.append(col_Ind_Random[cur_f])
    newData = Data[:, new_Ind]
    D_train, D_test, t_train, t_test = train_test_split(newData,
                                                        target,
                                                        test_size = 0.3,
                                                        random_state=0)
    clf = KNeighborsClassifier(5, weights='distance', p=1)
    fit = clf.fit(D_train, t_train)
    cur_Score = clf.score(D_test, t_test)
    if cur_Score < cur_MaxScore:
        # the new feature hurt the score: drop it again
        new_Ind.remove(col_Ind_Random[cur_f])
    else:
        cur_MaxScore = cur_Score
        print("Score with " + str(len(new_Ind)) + " selected features: " + str(cur_Score))
Score with 1 selected features: 0.15555555555555556
Score with 2 selected features: 0.23809523809523808
Score with 3 selected features: 0.29523809523809524
Score with 4 selected features: 0.43174603174603177
Score with 5 selected features: 0.5555555555555556
Score with 6 selected features: 0.6031746031746031
Score with 7 selected features: 0.6317460317460317
Score with 8 selected features: 0.6761904761904762
Score with 9 selected features: 0.7111111111111111
Score with 10 selected features: 0.7428571428571429
Score with 11 selected features: 0.7841269841269841
Score with 12 selected features: 0.8825396825396825
Score with 13 selected features: 0.8857142857142857
Score with 14 selected features: 0.9047619047619048
Score with 15 selected features: 0.9238095238095239
Score with 16 selected features: 0.9428571428571428
Score with 17 selected features: 0.9492063492063492
Score with 18 selected features: 0.9492063492063492
Score with 19 selected features: 0.9555555555555556
Score with 20 selected features: 0.9619047619047619
Score with 21 selected features: 0.9619047619047619
Score with 22 selected features: 0.9650793650793651
Score with 23 selected features: 0.9650793650793651
Score with 24 selected features: 0.9682539682539683
Score with 25 selected features: 0.9746031746031746
Score with 26 selected features: 0.9746031746031746
Score with 27 selected features: 0.9746031746031746
Score with 28 selected features: 0.9777777777777777
Score with 29 selected features: 0.9809523809523809
Score with 30 selected features: 0.9809523809523809
Score with 31 selected features: 0.9809523809523809
Score with 32 selected features: 0.9809523809523809
Score with 33 selected features: 0.9809523809523809
Score with 34 selected features: 0.9873015873015873
Score with 35 selected features: 0.9873015873015873
Score with 36 selected features: 0.9873015873015873
Score with 37 selected features: 0.9873015873015873
Score with 38 selected features: 0.9873015873015873
Score with 39 selected features: 0.9904761904761905
Score with 40 selected features: 0.9904761904761905
Score with 41 selected features: 0.9904761904761905
Score with 42 selected features: 0.9904761904761905
Score with 43 selected features: 0.9904761904761905
Score with 44 selected features: 0.9904761904761905
Score with 45 selected features: 0.9904761904761905
Score with 46 selected features: 0.9904761904761905
Score with 47 selected features: 0.9968253968253968
print("There are " + str(len(new_Ind)) + " features selected:")
There are 47 features selected:
print(new_Ind)
[31, 43, 26, 74, 58, 59, 61, 51, 36, 57, 10, 53, 71, 46, 15, 19, 76, 63, 75, 62, 38, 67, 44, 56, 39, 47, 48, 7, 73, 52, 13, 32, 30, 49, 54, 23, 4, 14, 65, 20, 50, 25, 6, 1, 64, 9, 12]
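Note that this greedy search scores every candidate subset on a hold-out split drawn from the same data that is used again for the final evaluation, so the printed scores are optimistically biased. As a cross-validated alternative, scikit-learn (0.24+) offers SequentialFeatureSelector; a minimal sketch (considerably slower than the loop above, and assuming we want the same number of features):
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier

# Forward selection scored by 3-fold cross-validation instead of a single split
sfs = SequentialFeatureSelector(KNeighborsClassifier(5, weights='distance', p=1),
                                n_features_to_select=47, direction='forward', cv=3)
sfs.fit(Data, target)
print(Data_df.columns[sfs.get_support()])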
dataset = pd.DataFrame(Data[:, new_Ind], columns=Data_df.columns[new_Ind])
dataset.shape
(1047, 47)
dataset.head()
| Bcatenin_N | NUMB_N | JNK_N | EGR1_N | GluR3_N | GluR4_N | P3525_N | RRP1_N | DSCR1_N | GFAP_N | pERK_N | ARC_N | pCFOS_N | pPKCG_N | pNR2A_N | AKT_N | CaNA_N | PSD95_N | H3MeK4_N | pCASP9_N | NR2B_N | SHH_N | P70S6_N | Tau_N | pNUMB_N | CDK5_N | S6_N | pCAMKII_N | H3AcK18_N | BAX_N | pMEK_N | SOD1_N | APP_N | ADARB1_N | ERBB4_N | ELK_N | NR2A_N | pNR1_N | Ubiquitin_N | BRAF_N | AcetylH3K9_N | GSK3B_N | pBRAF_N | ITSN1_N | SNCA_N | pELK_N | PKCA_N | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.747441 | 0.323059 | 0.483960 | 0.102890 | 0.531163 | 0.151103 | 0.169702 | 0.327870 | 0.553922 | 0.228825 | 0.157643 | 0.426816 | 0.133930 | 0.303363 | 0.286697 | 0.619182 | 0.705738 | 0.445345 | 0.084580 | 0.432843 | 0.509996 | 0.162941 | 0.373254 | 0.057168 | 0.469461 | 0.178458 | 0.324085 | 0.168257 | 0.087715 | 0.690257 | 0.482986 | 0.092035 | 0.549817 | 0.402810 | 0.412721 | 0.593715 | 0.630482 | 0.574775 | 0.256699 | 0.136915 | 0.083584 | 0.596269 | 0.333862 | 0.212885 | 0.044770 | 0.232553 | 0.747688 |
| 1 | 0.701976 | 0.322736 | 0.446943 | 0.115874 | 0.576828 | 0.149555 | 0.212421 | 0.326426 | 0.512117 | 0.251911 | 0.159721 | 0.429952 | 0.110434 | 0.302059 | 0.270688 | 0.590036 | 0.749771 | 0.438745 | 0.093977 | 0.472327 | 0.457694 | 0.220010 | 0.376914 | 0.075103 | 0.410046 | 0.177452 | 0.324004 | 0.154925 | 0.080692 | 0.664894 | 0.432886 | 0.075589 | 0.491969 | 0.386524 | 0.463251 | 0.548032 | 0.585247 | 0.502766 | 0.226088 | 0.128911 | 0.084514 | 0.584234 | 0.311188 | 0.188226 | 0.054452 | 0.205362 | 0.688545 |
| 2 | 0.710718 | 0.334325 | 0.463293 | 0.109050 | 0.607535 | 0.150434 | 0.202733 | 0.312439 | 0.510175 | 0.248192 | 0.154554 | 0.448652 | 0.121560 | 0.332565 | 0.283848 | 0.566053 | 0.868229 | 0.446602 | 0.082162 | 0.467603 | 0.467725 | 0.186816 | 0.383316 | 0.071718 | 0.400225 | 0.189744 | 0.369553 | 0.153485 | 0.080465 | 0.700212 | 0.434143 | 0.076445 | 0.472444 | 0.373234 | 0.415863 | 0.549983 | 0.575910 | 0.554483 | 0.214719 | 0.128651 | 0.094486 | 0.580790 | 0.325060 | 0.205696 | 0.044526 | 0.199194 | 0.670905 |
| 3 | 0.536330 | 0.220414 | 0.462966 | 0.164241 | 0.562594 | 0.156083 | 0.184790 | 0.338264 | 0.514305 | 0.245676 | 0.127028 | 0.392700 | 0.151031 | 0.364189 | 0.211238 | 0.463862 | 0.721879 | 0.408299 | 0.144543 | 0.364359 | 0.461189 | 0.179047 | 0.311501 | 0.053257 | 0.406715 | 0.181101 | 0.231762 | 0.132074 | 0.126763 | 0.669358 | 0.537247 | 0.076936 | 0.440780 | 0.333033 | 0.372772 | 0.342080 | 0.480646 | 0.413245 | 0.208943 | 0.121509 | 0.076248 | 0.545716 | 0.328596 | 0.157688 | 0.119259 | 0.205135 | 0.657048 |
| 4 | 0.519186 | 0.212521 | 0.425815 | 0.136298 | 0.656300 | 0.147044 | 0.187625 | 0.330219 | 0.501294 | 0.234762 | 0.117572 | 0.410187 | 0.147711 | 0.376199 | 0.200364 | 0.416779 | 0.812053 | 0.441639 | 0.149281 | 0.393332 | 0.416698 | 0.245702 | 0.317400 | 0.080162 | 0.372548 | 0.181805 | 0.258720 | 0.129086 | 0.096959 | 0.686524 | 0.422119 | 0.067645 | 0.410395 | 0.337126 | 0.385008 | 0.358746 | 0.441977 | 0.401706 | 0.215528 | 0.112407 | 0.079697 | 0.519977 | 0.315055 | 0.157838 | 0.116965 | 0.189152 | 0.608970 |
D_train, D_test, t_train, t_test = train_test_split(dataset,
target,
test_size=0.3,
stratify=target.values,
random_state=999)
print(D_train.shape)
print(D_test.shape)
print(t_train.shape)
print(t_test.shape)
(732, 47)
(315, 47)
(732,)
(315,)
KNN¶
There are several hyperparameters for KNN, for instance:
n_neighbors: the number of neighbors
weights: uniform or distance-based vote weighting
p: the order of the Minkowski distance (p = 1 is Manhattan, p = 2 is Euclidean)
Build the KNN classifier with 5 neighbors:
KNN_5 = KNeighborsClassifier(5)
fit = KNN_5.fit(D_train, t_train)
t_pre = fit.predict(D_test)
t_pre.shape
(315,)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(t_test, t_pre)
print(cm)
[[43 0 2 0 0 0 0 0]
[ 0 45 0 0 0 0 0 0]
[ 0 0 41 0 0 0 0 0]
[ 0 0 0 40 0 0 0 1]
[ 0 0 0 0 40 0 0 0]
[ 0 0 0 0 0 36 0 0]
[ 0 1 0 0 0 0 34 1]
[ 0 0 0 1 0 0 0 30]]
from sklearn.metrics import classification_report
print(classification_report(t_test, t_pre))
precision recall f1-score support
0 1.00 0.96 0.98 45
1 0.98 1.00 0.99 45
2 0.95 1.00 0.98 41
3 0.98 0.98 0.98 41
4 1.00 1.00 1.00 40
5 1.00 1.00 1.00 36
6 1.00 0.94 0.97 36
7 0.94 0.97 0.95 31
accuracy 0.98 315
macro avg 0.98 0.98 0.98 315
weighted avg 0.98 0.98 0.98 315
Hyperparameter tuning¶
While keeping n_neighbors = 5, we try different settings for weights and p.
KNN_5_w = KNeighborsClassifier(5, weights = 'distance')
fit = KNN_5_w.fit(D_train, t_train)
t_pre = fit.predict(D_test)
cm = confusion_matrix(t_test, t_pre)
print(cm)
print(classification_report(t_test, t_pre))
[[45 0 0 0 0 0 0 0]
[ 0 45 0 0 0 0 0 0]
[ 0 0 41 0 0 0 0 0]
[ 0 0 0 40 0 0 0 1]
[ 0 0 0 0 40 0 0 0]
[ 0 0 0 0 0 36 0 0]
[ 0 0 0 0 0 0 35 1]
[ 0 0 0 1 0 0 0 30]]
precision recall f1-score support
0 1.00 1.00 1.00 45
1 1.00 1.00 1.00 45
2 1.00 1.00 1.00 41
3 0.98 0.98 0.98 41
4 1.00 1.00 1.00 40
5 1.00 1.00 1.00 36
6 1.00 0.97 0.99 36
7 0.94 0.97 0.95 31
accuracy 0.99 315
macro avg 0.99 0.99 0.99 315
weighted avg 0.99 0.99 0.99 315
Distance weighting seems to have helped the accuracy. Keeping n_neighbors = 5 and weights = 'distance', we now set p to 1.
KNN_5_w_1 = KNeighborsClassifier(5, weights = 'distance', p = 1)
fit = KNN_5_w_1.fit(D_train, t_train)
t_pre = fit.predict(D_test)
cm = confusion_matrix(t_test, t_pre)
print(cm)
print(classification_report(t_test, t_pre))
[[45 0 0 0 0 0 0 0]
[ 0 45 0 0 0 0 0 0]
[ 0 0 41 0 0 0 0 0]
[ 0 0 0 41 0 0 0 0]
[ 0 0 0 0 40 0 0 0]
[ 1 0 0 0 0 35 0 0]
[ 0 0 0 0 0 0 35 1]
[ 0 0 0 1 0 0 0 30]]
precision recall f1-score support
0 0.98 1.00 0.99 45
1 1.00 1.00 1.00 45
2 1.00 1.00 1.00 41
3 0.98 1.00 0.99 41
4 1.00 1.00 1.00 40
5 1.00 0.97 0.99 36
6 1.00 0.97 0.99 36
7 0.97 0.97 0.97 31
accuracy 0.99 315
macro avg 0.99 0.99 0.99 315
weighted avg 0.99 0.99 0.99 315
p = 1 and p = 2 produce similar accuracy. Let's try a smaller number of neighbors with p = 2.
KNN = KNeighborsClassifier(4, weights = 'distance', p = 2)
fit = KNN.fit(D_train, t_train)
t_pre = fit.predict(D_test)
cm = confusion_matrix(t_test, t_pre)
print(cm)
print(classification_report(t_test, t_pre))
[[45 0 0 0 0 0 0 0]
[ 0 45 0 0 0 0 0 0]
[ 0 0 41 0 0 0 0 0]
[ 0 0 0 40 0 0 0 1]
[ 0 0 0 0 40 0 0 0]
[ 0 0 0 0 0 36 0 0]
[ 0 0 0 0 0 0 35 1]
[ 0 0 0 1 0 0 0 30]]
precision recall f1-score support
0 1.00 1.00 1.00 45
1 1.00 1.00 1.00 45
2 1.00 1.00 1.00 41
3 0.98 0.98 0.98 41
4 1.00 1.00 1.00 40
5 1.00 1.00 1.00 36
6 1.00 0.97 0.99 36
7 0.94 0.97 0.95 31
accuracy 0.99 315
macro avg 0.99 0.99 0.99 315
weighted avg 0.99 0.99 0.99 315
Let’s use grid search to find the best number of neighbors to use.
Using the grid search method¶
from sklearn.model_selection import StratifiedKFold, GridSearchCV
cv_method = StratifiedKFold(n_splits=3, shuffle=True, random_state=999)
# define the parameter values
KNN_para = {'n_neighbors': [1, 2, 3, 4, 5, 6, 7], 'p': [1, 2]}
KNN_gs = GridSearchCV(KNeighborsClassifier(weights='distance'), KNN_para,
                      cv=cv_method, scoring='accuracy')
KNN_gs.fit(D_train, t_train)
GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=999, shuffle=True),
error_score='raise-deprecating',
estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
metric='minkowski',
metric_params=None, n_jobs=None,
n_neighbors=5, p=2,
weights='distance'),
iid='warn', n_jobs=None,
param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7], 'p': [1, 2]},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring='accuracy', verbose=0)
KNN_gs.best_params_
{'n_neighbors': 1, 'p': 2}
KNN_gs.best_estimator_
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=1, p=2,
weights='distance')
KNN_gs.best_score_
0.98224043715847
KNN_results = pd.DataFrame(KNN_gs.cv_results_)[['mean_test_score', 'std_test_score', 'params']]
KNN_results_sorted = KNN_results.sort_values('mean_test_score', ascending=False)
KNN_results_sorted.head()
| mean_test_score | std_test_score | params | |
|---|---|---|---|
| 1 | 0.982240 | 0.003959 | {'n_neighbors': 1, 'p': 2} |
| 3 | 0.982240 | 0.003959 | {'n_neighbors': 2, 'p': 2} |
| 0 | 0.978142 | 0.007871 | {'n_neighbors': 1, 'p': 1} |
| 2 | 0.978142 | 0.007871 | {'n_neighbors': 2, 'p': 1} |
| 7 | 0.965847 | 0.010961 | {'n_neighbors': 4, 'p': 2} |
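To see how CV accuracy varies with n_neighbors for each p, one could plot the grid results (a sketch, assuming matplotlib is available):
import matplotlib.pyplot as plt
res = pd.DataFrame(KNN_gs.cv_results_)
for p in (1, 2):
    sub = res[res['param_p'] == p]
    plt.plot(sub['param_n_neighbors'], sub['mean_test_score'], marker='o', label='p = %d' % p)
plt.xlabel('n_neighbors')
plt.ylabel('mean CV accuracy')
plt.legend()
plt.show()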
Note that n_neighbors = 1 and n_neighbors = 2 tie for the best score: with distance weighting, the nearer of two neighbors dominates the vote, so k = 2 usually reproduces the k = 1 predictions. Take one of the top configurations (n_neighbors = 2, p = 2) and test it on the test set; note that the model below uses uniform rather than distance weights.
clf = KNeighborsClassifier(n_neighbors=2, p=2, weights='uniform')
fit = clf.fit(D_train, t_train)
t_pre = fit.predict(D_test)
cm = confusion_matrix(t_test, t_pre)
print(cm)
print(classification_report(t_test, t_pre))
[[45 0 0 0 0 0 0 0]
[ 0 45 0 0 0 0 0 0]
[ 0 0 41 0 0 0 0 0]
[ 0 0 0 41 0 0 0 0]
[ 0 0 0 0 40 0 0 0]
[ 1 0 0 0 0 35 0 0]
[ 0 0 0 0 0 0 36 0]
[ 0 0 0 1 0 0 0 30]]
precision recall f1-score support
0 0.98 1.00 0.99 45
1 1.00 1.00 1.00 45
2 1.00 1.00 1.00 41
3 0.98 1.00 0.99 41
4 1.00 1.00 1.00 40
5 1.00 0.97 0.99 36
6 1.00 1.00 1.00 36
7 1.00 0.97 0.98 31
accuracy 0.99 315
macro avg 0.99 0.99 0.99 315
weighted avg 0.99 0.99 0.99 315
This gives better results than the earlier models. However, a paired t-test would be needed to tell whether this model is significantly better than the others. Also, with k = 1 the model is likely to overfit.
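As a sketch of such a test (assuming scipy is available), one could compare fold-wise CV accuracies of two candidate models on the same folds; with only 3 folds the test has little power, so more splits would be advisable:
from scipy import stats
from sklearn.model_selection import cross_val_score
# cv_method is seeded, so both models are scored on identical folds
scores_a = cross_val_score(KNeighborsClassifier(n_neighbors=1, weights='distance'),
                           D_train, t_train, cv=cv_method, scoring='accuracy')
scores_b = cross_val_score(KNeighborsClassifier(n_neighbors=2), D_train, t_train,
                           cv=cv_method, scoring='accuracy')
print(stats.ttest_rel(scores_a, scores_b))   # paired t-test over matched folds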
Decision Tree¶
Train a decision tree and evaluate it in the same way.
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state = 999)
fit = clf.fit(D_train, t_train)
t_pre = fit.predict(D_test)
t_pre.shape
(315,)
cm = confusion_matrix(t_test, t_pre)
print(cm)
print(classification_report(t_test, t_pre))
print(clf.tree_.max_depth)
[[37 0 2 0 3 1 2 0]
[ 0 41 0 1 0 0 1 2]
[ 2 0 37 0 0 2 0 0]
[ 1 3 0 33 0 0 2 2]
[ 0 0 0 0 39 0 1 0]
[ 0 0 1 0 0 35 0 0]
[ 0 3 0 9 0 0 22 2]
[ 0 6 0 4 0 0 0 21]]
precision recall f1-score support
0 0.93 0.82 0.87 45
1 0.77 0.91 0.84 45
2 0.93 0.90 0.91 41
3 0.70 0.80 0.75 41
4 0.93 0.97 0.95 40
5 0.92 0.97 0.95 36
6 0.79 0.61 0.69 36
7 0.78 0.68 0.72 31
accuracy 0.84 315
macro avg 0.84 0.83 0.83 315
weighted avg 0.84 0.84 0.84 315
12
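Before switching the split criterion, recall the two impurity measures: Gini impurity is 1 − Σ p² and entropy is −Σ p log₂ p over the class proportions p in a node. A quick check for a hypothetical 50/50 node:
p = np.array([0.5, 0.5])            # hypothetical class proportions in a node
print(1 - np.sum(p ** 2))           # Gini impurity: 0.5
print(-np.sum(p * np.log2(p)))      # entropy: 1.0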
Set entropy as the criterion
clf = DecisionTreeClassifier(criterion='entropy', random_state=999)
fit = clf.fit(D_train, t_train)
t_pre = fit.predict(D_test)
t_pre.shape
cm = confusion_matrix(t_test, t_pre)
print(cm)
print(classification_report(t_test, t_pre))
print(clf.tree_.max_depth)
[[34 0 4 0 3 1 1 2]
[ 0 39 0 2 0 0 2 2]
[ 1 0 40 0 0 0 0 0]
[ 0 5 0 33 0 0 1 2]
[ 0 1 0 0 39 0 0 0]
[ 0 0 1 0 0 35 0 0]
[ 0 5 0 5 0 0 22 4]
[ 0 5 0 1 0 0 0 25]]
precision recall f1-score support
0 0.97 0.76 0.85 45
1 0.71 0.87 0.78 45
2 0.89 0.98 0.93 41
3 0.80 0.80 0.80 41
4 0.93 0.97 0.95 40
5 0.97 0.97 0.97 36
6 0.85 0.61 0.71 36
7 0.71 0.81 0.76 31
accuracy 0.85 315
macro avg 0.85 0.85 0.84 315
weighted avg 0.86 0.85 0.85 315
9
Next, regularize the tree by requiring at least 10 samples to split an internal node.
clf = DecisionTreeClassifier(criterion='entropy', min_samples_split=10, random_state=999)
fit = clf.fit(D_train, t_train)
t_pre = fit.predict(D_test)
t_pre.shape
cm = confusion_matrix(t_test, t_pre)
print(cm)
print(classification_report(t_test, t_pre))
print(clf.tree_.max_depth)
[[34 0 3 0 3 1 1 3]
[ 0 35 0 5 0 0 4 1]
[ 4 0 37 0 0 0 0 0]
[ 0 5 0 25 0 0 7 4]
[ 0 0 0 0 40 0 0 0]
[ 1 0 1 0 0 34 0 0]
[ 0 5 0 3 2 0 21 5]
[ 0 6 0 2 0 0 1 22]]
precision recall f1-score support
0 0.87 0.76 0.81 45
1 0.69 0.78 0.73 45
2 0.90 0.90 0.90 41
3 0.71 0.61 0.66 41
4 0.89 1.00 0.94 40
5 0.97 0.94 0.96 36
6 0.62 0.58 0.60 36
7 0.63 0.71 0.67 31
accuracy 0.79 315
macro avg 0.79 0.79 0.78 315
weighted avg 0.79 0.79 0.79 315
8
Using entropy slightly improved the performance (0.85 vs. 0.84 accuracy), although adding min_samples_split = 10 reduced it to 0.79. Now set max_depth to 5.
clf = DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_split=10, random_state=999)
fit = clf.fit(D_train, t_train)
t_pre = fit.predict(D_test)
t_pre.shape
cm = confusion_matrix(t_test, t_pre)
print(cm)
print(classification_report(t_test, t_pre))
print(clf.tree_.max_depth)
[[34 0 2 0 3 2 4 0]
[ 0 30 0 11 0 0 2 2]
[ 4 0 33 0 2 2 0 0]
[ 0 6 0 27 0 0 5 3]
[ 0 0 0 0 38 0 0 2]
[ 0 0 1 0 0 35 0 0]
[ 0 8 0 8 0 0 17 3]
[ 0 5 0 9 0 0 0 17]]
precision recall f1-score support
0 0.89 0.76 0.82 45
1 0.61 0.67 0.64 45
2 0.92 0.80 0.86 41
3 0.49 0.66 0.56 41
4 0.88 0.95 0.92 40
5 0.90 0.97 0.93 36
6 0.61 0.47 0.53 36
7 0.63 0.55 0.59 31
accuracy 0.73 315
macro avg 0.74 0.73 0.73 315
weighted avg 0.74 0.73 0.73 315
5
Accuracy dropped to 0.73, so a depth of 5 underfits. Try the default gini criterion with a minimum of 10 samples per leaf as well.
clf = DecisionTreeClassifier(max_depth=5, min_samples_split=10, min_samples_leaf=10, random_state=999)
fit = clf.fit(D_train, t_train)
t_pre = fit.predict(D_test)
t_pre.shape
cm = confusion_matrix(t_test, t_pre)
print(cm)
print(classification_report(t_test, t_pre))
print(clf.tree_.max_depth)
[[32 1 2 0 5 2 0 3]
[ 0 35 0 6 0 0 1 3]
[ 2 0 33 0 4 2 0 0]
[ 0 2 0 28 0 0 0 11]
[ 0 0 0 0 38 0 0 2]
[ 0 0 3 0 0 33 0 0]
[ 0 6 0 10 0 0 14 6]
[ 0 4 0 7 0 0 1 19]]
precision recall f1-score support
0 0.94 0.71 0.81 45
1 0.73 0.78 0.75 45
2 0.87 0.80 0.84 41
3 0.55 0.68 0.61 41
4 0.81 0.95 0.87 40
5 0.89 0.92 0.90 36
6 0.88 0.39 0.54 36
7 0.43 0.61 0.51 31
accuracy 0.74 315
macro avg 0.76 0.73 0.73 315
weighted avg 0.77 0.74 0.74 315
5
Rather than tuning by hand, grid-search over max_depth, min_samples_split, and min_samples_leaf:
params_DT = {'max_depth': [5, 8, 10, 12], 'min_samples_split': [5, 10, 15], 'min_samples_leaf': [5, 10, 15]}
gs_DT = GridSearchCV(DecisionTreeClassifier(criterion='entropy'),
                     param_grid=params_DT,
                     cv=cv_method,
                     scoring='accuracy')
gs_DT.fit(D_train, t_train)
C:\Users\Wei\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:814: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
DeprecationWarning)
GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=999, shuffle=True),
error_score='raise-deprecating',
estimator=DecisionTreeClassifier(class_weight=None,
criterion='entropy',
max_depth=None, max_features=None,
max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
presort=False, random_state=None,
splitter='best'),
iid='warn', n_jobs=None,
param_grid={'max_depth': [5, 8, 10, 12],
'min_samples_leaf': [5, 10, 15],
'min_samples_split': [5, 10, 15]},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring='accuracy', verbose=0)
gs_DT.best_params_
{'max_depth': 12, 'min_samples_leaf': 5, 'min_samples_split': 5}
gs_DT.best_estimator_
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=12,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=5, min_samples_split=5,
min_weight_fraction_leaf=0.0, presort=False,
random_state=None, splitter='best')
gs_DT.best_score_
0.7718579234972678
DT_results = pd.DataFrame(gs_DT.cv_results_)[['mean_test_score', 'std_test_score', 'params']]
DT_results_sorted = DT_results.sort_values('mean_test_score', ascending=False)
DT_results_sorted.head()
| mean_test_score | std_test_score | params | |
|---|---|---|---|
| 27 | 0.771858 | 0.004279 | {'max_depth': 12, 'min_samples_leaf': 5, 'min_... |
| 20 | 0.762295 | 0.007632 | {'max_depth': 10, 'min_samples_leaf': 5, 'min_... |
| 9 | 0.762295 | 0.010088 | {'max_depth': 8, 'min_samples_leaf': 5, 'min_s... |
| 18 | 0.760929 | 0.008506 | {'max_depth': 10, 'min_samples_leaf': 5, 'min_... |
| 19 | 0.760929 | 0.005822 | {'max_depth': 10, 'min_samples_leaf': 5, 'min_... |
Evaluate a configuration from near the top of the grid-search results (max_depth = 8, min_samples_leaf = 5, min_samples_split = 10) on the test set; a shallower tree is less likely to overfit than the max_depth = 12 winner.
clf = DecisionTreeClassifier(criterion='entropy', max_depth=8,
                             min_samples_leaf=5, min_samples_split=10)
fit = clf.fit(D_train, t_train)
t_pre = fit.predict(D_test)
t_pre.shape
cm = confusion_matrix(t_test, t_pre)
print(cm)
print(classification_report(t_test, t_pre))
print(clf.tree_.max_depth)
[[33 0 2 0 3 1 3 3]
[ 0 35 0 5 0 0 3 2]
[ 3 0 36 0 0 0 2 0]
[ 0 5 0 26 0 0 6 4]
[ 0 2 0 0 38 0 0 0]
[ 1 0 1 0 0 34 0 0]
[ 0 8 0 4 0 0 23 1]
[ 0 5 0 3 0 0 1 22]]
precision recall f1-score support
0 0.89 0.73 0.80 45
1 0.64 0.78 0.70 45
2 0.92 0.88 0.90 41
3 0.68 0.63 0.66 41
4 0.93 0.95 0.94 40
5 0.97 0.94 0.96 36
6 0.61 0.64 0.62 36
7 0.69 0.71 0.70 31
accuracy 0.78 315
macro avg 0.79 0.78 0.78 315
weighted avg 0.79 0.78 0.79 315
8
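To see which proteins drive the splits, the fitted tree exposes impurity-based importances (a sketch; it assumes the column order of dataset matches the training features, which holds since D_train is a split of dataset):
# Top 10 proteins by impurity-based importance in the fitted tree
imp = pd.Series(clf.feature_importances_, index=dataset.columns)
print(imp.sort_values(ascending=False).head(10))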
from sklearn import tree
with open('mice_protein.dot', 'w') as f:
    tree.export_graphviz(clf, out_file=f, feature_names=dataset.columns,
                         class_names=target_names, filled=True, rounded=True,
                         special_characters=True)
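To turn the .dot file into an image, one can run Graphviz from a shell (dot -Tpng mice_protein.dot -o mice_protein.png) or, as a sketch assuming the graphviz Python package is installed, render it directly:
import graphviz
with open('mice_protein.dot') as f:
    graph = graphviz.Source(f.read())
graph.render('mice_protein', format='png', cleanup=True)   # writes mice_protein.png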